import os
os.getcwd()  # confirm the notebook's working directory before relative file access
'D:\\PYTHON\\AutoML'
os.listdir()
['.ipynb_checkpoints', 'AutoML_practice.ipynb', 'AutoML_practice2.ipynb', 'AutoViz_Plots', 'auto_ml.ipynb', 'heart.csv', 'plots_optimize', 'README.md', 'scikit_optimize.ipynb', 'sklearn-gridsearchcv-replacement.ipynb', 'test']
import pandas as pd
# Heart-disease dataset: 1025 rows x 14 columns (13 features + binary 'target').
df = pd.read_csv("heart.csv")
df.head()
| age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | target | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 52 | 1 | 0 | 125 | 212 | 0 | 1 | 168 | 0 | 1.0 | 2 | 2 | 3 | 0 |
| 1 | 53 | 1 | 0 | 140 | 203 | 1 | 0 | 155 | 1 | 3.1 | 0 | 0 | 3 | 0 |
| 2 | 70 | 1 | 0 | 145 | 174 | 0 | 1 | 125 | 1 | 2.6 | 0 | 0 | 3 | 0 |
| 3 | 61 | 1 | 0 | 148 | 203 | 0 | 1 | 161 | 0 | 0.0 | 2 | 1 | 3 | 0 |
| 4 | 62 | 0 | 0 | 138 | 294 | 1 | 1 | 106 | 0 | 1.9 | 1 | 3 | 2 | 0 |
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1025 entries, 0 to 1024 Data columns (total 14 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 age 1025 non-null int64 1 sex 1025 non-null int64 2 cp 1025 non-null int64 3 trestbps 1025 non-null int64 4 chol 1025 non-null int64 5 fbs 1025 non-null int64 6 restecg 1025 non-null int64 7 thalach 1025 non-null int64 8 exang 1025 non-null int64 9 oldpeak 1025 non-null float64 10 slope 1025 non-null int64 11 ca 1025 non-null int64 12 thal 1025 non-null int64 13 target 1025 non-null int64 dtypes: float64(1), int64(13) memory usage: 112.2 KB
df.describe().transpose()
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| age | 1025.0 | 54.434146 | 9.072290 | 29.0 | 48.0 | 56.0 | 61.0 | 77.0 |
| sex | 1025.0 | 0.695610 | 0.460373 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 |
| cp | 1025.0 | 0.942439 | 1.029641 | 0.0 | 0.0 | 1.0 | 2.0 | 3.0 |
| trestbps | 1025.0 | 131.611707 | 17.516718 | 94.0 | 120.0 | 130.0 | 140.0 | 200.0 |
| chol | 1025.0 | 246.000000 | 51.592510 | 126.0 | 211.0 | 240.0 | 275.0 | 564.0 |
| fbs | 1025.0 | 0.149268 | 0.356527 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 |
| restecg | 1025.0 | 0.529756 | 0.527878 | 0.0 | 0.0 | 1.0 | 1.0 | 2.0 |
| thalach | 1025.0 | 149.114146 | 23.005724 | 71.0 | 132.0 | 152.0 | 166.0 | 202.0 |
| exang | 1025.0 | 0.336585 | 0.472772 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 |
| oldpeak | 1025.0 | 1.071512 | 1.175053 | 0.0 | 0.0 | 0.8 | 1.8 | 6.2 |
| slope | 1025.0 | 1.385366 | 0.617755 | 0.0 | 1.0 | 1.0 | 2.0 | 2.0 |
| ca | 1025.0 | 0.754146 | 1.030798 | 0.0 | 0.0 | 0.0 | 1.0 | 4.0 |
| thal | 1025.0 | 2.323902 | 0.620660 | 0.0 | 2.0 | 2.0 | 3.0 | 3.0 |
| target | 1025.0 | 0.513171 | 0.500070 | 0.0 | 0.0 | 1.0 | 1.0 | 1.0 |
df.nunique()
age 41 sex 2 cp 4 trestbps 49 chol 152 fbs 2 restecg 3 thalach 91 exang 2 oldpeak 40 slope 3 ca 5 thal 4 target 2 dtype: int64
df.target
0 0
1 0
2 0
3 0
4 0
..
1020 1
1021 0
1022 0
1023 1
1024 0
Name: target, Length: 1025, dtype: int64
AutoViz GitHub repository
https://github.com/AutoViML/AutoViz
usage
filename = "" sep = "," dft = AV.AutoViz( filename, sep=",", depVar="", dfte=None, header=0, verbose=0, lowess=False, chart_format="svg", max_rows_analyzed=150000, max_cols_analyzed=30, save_plot_dir=None )
- Numeric Columns : ['oldpeak']
- Integer-Categorical Columns: ['age', 'cp', 'trestbps', 'chol', 'restecg', 'thalach', 'slope', 'ca', 'thal']
- String-Categorical Columns: []
- Factor-Categorical Columns: []
- String-Boolean Columns: []
- Numeric-Boolean Columns: ['sex', 'fbs', 'exang']
- Discrete String Columns: []
- NLP text Columns: []
- Date Time Columns: []
- ID Columns: []
Categorical variables %s (" ['age', 'cp', 'trestbps', 'chol', 'restecg', 'thalach', 'slope', 'ca', " "'thal', 'sex', 'fbs', 'exang']")
Continuous variables %s " ['oldpeak']"
%matplotlib inline
from autoviz.AutoViz_Class import AutoViz_Class
import matplotlib.pyplot as plt
# plt.style.use("dark_background")
# NOTE: the dark style is deliberately left disabled; charts render with defaults.
Imported v0.1.58. After importing, execute '%matplotlib inline' to display charts in Jupyter.
AV = AutoViz_Class()

# BUG FIX: `filename` was never defined by any executed cell (the earlier
# `filename = ""` snippet is README usage text, not code), so this call
# raised NameError. Point it at the CSV already loaded above.
filename = "heart.csv"

# verbose=1: display charts inline and also print EDA data-cleaning suggestions.
dfte = AV.AutoViz(filename, sep=',', depVar='', dfte=None, header=0, verbose=1, lowess=False,
                  chart_format='svg', max_rows_analyzed=150000, max_cols_analyzed=30,
                  save_plot_dir=None)
Update: verbose=0 displays charts in your local Jupyter notebook.
verbose=1 additionally provides EDA data cleaning suggestions. It also displays charts.
verbose=2 does not display charts but saves them in AutoViz_Plots folder in local machine.
chart_format='bokeh' displays charts in your local Jupyter notebook.
chart_format='server' displays charts in your browser: one tab for each chart type
chart_format='html' silently saves interactive HTML files in your local machine
AV = AutoViz_Class()

# 1. Save charts in the default SVG format.
#    When no save folder is given, AutoViz automatically creates an
#    `AutoViz_Plots` directory and writes the output there.
n_rows, n_cols = df.shape
AV.AutoViz(
    filename='',          # empty string: use the DataFrame passed via dfte instead
    dfte=df,              # input data
    depVar='target',      # dependent (target) variable
    verbose=2,            # 0: brief display, 1: detailed display, 2: save to files
    max_rows_analyzed=n_rows,
    max_cols_analyzed=n_cols,
)
Shape of your Data Set loaded: (1025, 14) ####################################################################################### ######################## C L A S S I F Y I N G V A R I A B L E S #################### ####################################################################################### Classifying variables in data set... Data cleaning improvement suggestions. Complete them before proceeding to ML modeling.
| Nuniques | dtype | Nulls | Nullpercent | NuniquePercent | Value counts Min | Data cleaning improvement suggestions | |
|---|---|---|---|---|---|---|---|
| chol | 152 | int64 | 0 | 0.000000 | 14.829268 | 0 | |
| thalach | 91 | int64 | 0 | 0.000000 | 8.878049 | 0 | |
| trestbps | 49 | int64 | 0 | 0.000000 | 4.780488 | 0 | |
| age | 41 | int64 | 0 | 0.000000 | 4.000000 | 0 | |
| oldpeak | 40 | float64 | 0 | 0.000000 | 3.902439 | 0 | skewed: cap or drop outliers |
| ca | 5 | int64 | 0 | 0.000000 | 0.487805 | 0 | |
| cp | 4 | int64 | 0 | 0.000000 | 0.390244 | 0 | |
| thal | 4 | int64 | 0 | 0.000000 | 0.390244 | 0 | |
| restecg | 3 | int64 | 0 | 0.000000 | 0.292683 | 0 | |
| slope | 3 | int64 | 0 | 0.000000 | 0.292683 | 0 | |
| sex | 2 | int64 | 0 | 0.000000 | 0.195122 | 0 | |
| fbs | 2 | int64 | 0 | 0.000000 | 0.195122 | 0 | |
| exang | 2 | int64 | 0 | 0.000000 | 0.195122 | 0 |
Printing upto 30 columns max in each category:
Numeric Columns : ['oldpeak']
Integer-Categorical Columns: ['age', 'cp', 'trestbps', 'chol', 'restecg', 'thalach', 'slope', 'ca', 'thal']
String-Categorical Columns: []
Factor-Categorical Columns: []
String-Boolean Columns: []
Numeric-Boolean Columns: ['sex', 'fbs', 'exang']
Discrete String Columns: []
NLP text Columns: []
Date Time Columns: []
ID Columns: []
Columns that will not be considered in modeling: []
13 Predictors classified...
No variables removed since no ID or low-information variables found in data set
Since Number of Rows in data 1025 exceeds maximum, randomly sampling 1025 rows for EDA...
################ Binary_Classification problem #####################
Columns to delete:
' []'
Boolean variables %s
" ['sex', 'fbs', 'exang']"
Categorical variables %s
(" ['age', 'cp', 'trestbps', 'chol', 'restecg', 'thalach', 'slope', 'ca', "
"'thal', 'sex', 'fbs', 'exang']")
Continuous variables %s
" ['oldpeak']"
Discrete string variables %s
' []'
Date and time variables %s
' []'
ID variables %s
' []'
Target variable %s
' target'
All Plots are saved in .\AutoViz_Plots\target
Time to run AutoViz = 7 seconds
| age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | target | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 807 | 44 | 1 | 2 | 130 | 233 | 0 | 1 | 179 | 1 | 0.4 | 2 | 0 | 2 | 0 |
| 27 | 58 | 0 | 1 | 136 | 319 | 1 | 0 | 152 | 0 | 0.0 | 2 | 2 | 2 | 1 |
| 77 | 63 | 1 | 0 | 140 | 187 | 0 | 0 | 144 | 1 | 4.0 | 2 | 2 | 3 | 1 |
| 406 | 58 | 1 | 2 | 140 | 211 | 1 | 0 | 165 | 0 | 0.0 | 2 | 0 | 2 | 0 |
| 886 | 61 | 1 | 0 | 120 | 260 | 0 | 1 | 140 | 1 | 3.6 | 1 | 1 | 3 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 835 | 49 | 1 | 2 | 118 | 149 | 0 | 0 | 126 | 0 | 0.8 | 2 | 3 | 2 | 1 |
| 192 | 67 | 0 | 2 | 115 | 564 | 0 | 0 | 160 | 0 | 1.6 | 1 | 0 | 3 | 0 |
| 629 | 65 | 1 | 3 | 138 | 282 | 1 | 0 | 174 | 0 | 1.4 | 1 | 1 | 2 | 1 |
| 559 | 67 | 1 | 0 | 120 | 237 | 0 | 1 | 71 | 0 | 1.0 | 1 | 0 | 2 | 1 |
| 684 | 60 | 1 | 2 | 140 | 185 | 0 | 0 | 155 | 0 | 3.0 | 1 | 0 | 2 | 1 |
1025 rows × 14 columns
# 2. Save charts as PNG files; the output directory can also be chosen.
AV.AutoViz(
    filename='',                           # empty: use dfte as the input data
    dfte=df,                               # DataFrame used as input
    depVar='target',                       # target variable
    verbose=2,                             # 0: brief, 1: detailed, 2: save to files
    max_rows_analyzed=df.shape[0],         # max number of rows used in the analysis
    max_cols_analyzed=df.shape[1],         # max number of columns (features) analyzed
    chart_format="png",                    # file format (extension) of the saved charts
    save_plot_dir="./plots_autoviz2_png",  # output directory
)
Shape of your Data Set loaded: (1025, 14) ####################################################################################### ######################## C L A S S I F Y I N G V A R I A B L E S #################### ####################################################################################### Classifying variables in data set... Data cleaning improvement suggestions. Complete them before proceeding to ML modeling.
| Nuniques | dtype | Nulls | Nullpercent | NuniquePercent | Value counts Min | Data cleaning improvement suggestions | |
|---|---|---|---|---|---|---|---|
| chol | 152 | int64 | 0 | 0.000000 | 14.829268 | 0 | |
| thalach | 91 | int64 | 0 | 0.000000 | 8.878049 | 0 | |
| trestbps | 49 | int64 | 0 | 0.000000 | 4.780488 | 0 | |
| age | 41 | int64 | 0 | 0.000000 | 4.000000 | 0 | |
| oldpeak | 40 | float64 | 0 | 0.000000 | 3.902439 | 0 | skewed: cap or drop outliers |
| ca | 5 | int64 | 0 | 0.000000 | 0.487805 | 0 | |
| cp | 4 | int64 | 0 | 0.000000 | 0.390244 | 0 | |
| thal | 4 | int64 | 0 | 0.000000 | 0.390244 | 0 | |
| restecg | 3 | int64 | 0 | 0.000000 | 0.292683 | 0 | |
| slope | 3 | int64 | 0 | 0.000000 | 0.292683 | 0 | |
| sex | 2 | int64 | 0 | 0.000000 | 0.195122 | 0 | |
| fbs | 2 | int64 | 0 | 0.000000 | 0.195122 | 0 | |
| exang | 2 | int64 | 0 | 0.000000 | 0.195122 | 0 |
Printing upto 30 columns max in each category:
Numeric Columns : ['oldpeak']
Integer-Categorical Columns: ['age', 'cp', 'trestbps', 'chol', 'restecg', 'thalach', 'slope', 'ca', 'thal']
String-Categorical Columns: []
Factor-Categorical Columns: []
String-Boolean Columns: []
Numeric-Boolean Columns: ['sex', 'fbs', 'exang']
Discrete String Columns: []
NLP text Columns: []
Date Time Columns: []
ID Columns: []
Columns that will not be considered in modeling: []
13 Predictors classified...
No variables removed since no ID or low-information variables found in data set
Since Number of Rows in data 1025 exceeds maximum, randomly sampling 1025 rows for EDA...
################ Binary_Classification problem #####################
Columns to delete:
' []'
Boolean variables %s
" ['sex', 'fbs', 'exang']"
Categorical variables %s
(" ['age', 'cp', 'trestbps', 'chol', 'restecg', 'thalach', 'slope', 'ca', "
"'thal', 'sex', 'fbs', 'exang']")
Continuous variables %s
" ['oldpeak']"
Discrete string variables %s
' []'
Date and time variables %s
' []'
ID variables %s
' []'
Target variable %s
' target'
All Plots are saved in ./plots_autoviz2_png\target
Time to run AutoViz = 7 seconds
| age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | target | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 807 | 44 | 1 | 2 | 130 | 233 | 0 | 1 | 179 | 1 | 0.4 | 2 | 0 | 2 | 0 |
| 27 | 58 | 0 | 1 | 136 | 319 | 1 | 0 | 152 | 0 | 0.0 | 2 | 2 | 2 | 1 |
| 77 | 63 | 1 | 0 | 140 | 187 | 0 | 0 | 144 | 1 | 4.0 | 2 | 2 | 3 | 1 |
| 406 | 58 | 1 | 2 | 140 | 211 | 1 | 0 | 165 | 0 | 0.0 | 2 | 0 | 2 | 0 |
| 886 | 61 | 1 | 0 | 120 | 260 | 0 | 1 | 140 | 1 | 3.6 | 1 | 1 | 3 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 835 | 49 | 1 | 2 | 118 | 149 | 0 | 0 | 126 | 0 | 0.8 | 2 | 3 | 2 | 1 |
| 192 | 67 | 0 | 2 | 115 | 564 | 0 | 0 | 160 | 0 | 1.6 | 1 | 0 | 3 | 0 |
| 629 | 65 | 1 | 3 | 138 | 282 | 1 | 0 | 174 | 0 | 1.4 | 1 | 1 | 2 | 1 |
| 559 | 67 | 1 | 0 | 120 | 237 | 0 | 1 | 71 | 0 | 1.0 | 1 | 0 | 2 | 1 |
| 684 | 60 | 1 | 2 | 140 | 185 | 0 | 0 | 155 | 0 | 3.0 | 1 | 0 | 2 | 1 |
1025 rows × 14 columns
# 3. Save interactive graphs as HTML files in the chosen folder.
AV.AutoViz(
    filename='',
    dfte=df,
    depVar='target',
    verbose=2,  # 0: brief, 1: detailed, 2: save to files
    max_rows_analyzed=df.shape[0],
    max_cols_analyzed=df.shape[1],
    chart_format="html",
    save_plot_dir="./plots_autoviz3_html",
)
Shape of your Data Set loaded: (1025, 14) ####################################################################################### ######################## C L A S S I F Y I N G V A R I A B L E S #################### ####################################################################################### Classifying variables in data set... Data cleaning improvement suggestions. Complete them before proceeding to ML modeling.
| Nuniques | dtype | Nulls | Nullpercent | NuniquePercent | Value counts Min | Data cleaning improvement suggestions | |
|---|---|---|---|---|---|---|---|
| chol | 152 | int64 | 0 | 0.000000 | 14.829268 | 0 | |
| thalach | 91 | int64 | 0 | 0.000000 | 8.878049 | 0 | |
| trestbps | 49 | int64 | 0 | 0.000000 | 4.780488 | 0 | |
| age | 41 | int64 | 0 | 0.000000 | 4.000000 | 0 | |
| oldpeak | 40 | float64 | 0 | 0.000000 | 3.902439 | 0 | skewed: cap or drop outliers |
| ca | 5 | int64 | 0 | 0.000000 | 0.487805 | 0 | |
| cp | 4 | int64 | 0 | 0.000000 | 0.390244 | 0 | |
| thal | 4 | int64 | 0 | 0.000000 | 0.390244 | 0 | |
| restecg | 3 | int64 | 0 | 0.000000 | 0.292683 | 0 | |
| slope | 3 | int64 | 0 | 0.000000 | 0.292683 | 0 | |
| sex | 2 | int64 | 0 | 0.000000 | 0.195122 | 0 | |
| fbs | 2 | int64 | 0 | 0.000000 | 0.195122 | 0 | |
| exang | 2 | int64 | 0 | 0.000000 | 0.195122 | 0 |
Printing upto 30 columns max in each category:
Numeric Columns : ['oldpeak']
Integer-Categorical Columns: ['age', 'cp', 'trestbps', 'chol', 'restecg', 'thalach', 'slope', 'ca', 'thal']
String-Categorical Columns: []
Factor-Categorical Columns: []
String-Boolean Columns: []
Numeric-Boolean Columns: ['sex', 'fbs', 'exang']
Discrete String Columns: []
NLP text Columns: []
Date Time Columns: []
ID Columns: []
Columns that will not be considered in modeling: []
13 Predictors classified...
No variables removed since no ID or low-information variables found in data set
Since Number of Rows in data 1025 exceeds maximum, randomly sampling 1025 rows for EDA...
################ Binary_Classification problem #####################
Columns to delete:
' []'
Boolean variables %s
" ['sex', 'fbs', 'exang']"
Categorical variables %s
(" ['age', 'cp', 'trestbps', 'chol', 'restecg', 'thalach', 'slope', 'ca', "
"'thal', 'sex', 'fbs', 'exang']")
Continuous variables %s
" ['oldpeak']"
Discrete string variables %s
' []'
Date and time variables %s
' []'
ID variables %s
' []'
Target variable %s
' target'
Saving scatterplots in HTML format
Saving distplots_cats in HTML format
Saving distplots_nums in HTML format
Saving kde_plots in HTML format
Saving violinplots in HTML format
Saving cat_var_plots in HTML format
Time to run AutoViz (in seconds) = 2
| age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | target | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 807 | 44 | 1 | 2 | 130 | 233 | 0 | 1 | 179 | 1 | 0.4 | 2 | 0 | 2 | 1 |
| 27 | 58 | 0 | 1 | 136 | 319 | 1 | 0 | 152 | 0 | 0.0 | 2 | 2 | 2 | 0 |
| 77 | 63 | 1 | 0 | 140 | 187 | 0 | 0 | 144 | 1 | 4.0 | 2 | 2 | 3 | 0 |
| 406 | 58 | 1 | 2 | 140 | 211 | 1 | 0 | 165 | 0 | 0.0 | 2 | 0 | 2 | 1 |
| 886 | 61 | 1 | 0 | 120 | 260 | 0 | 1 | 140 | 1 | 3.6 | 1 | 1 | 3 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 835 | 49 | 1 | 2 | 118 | 149 | 0 | 0 | 126 | 0 | 0.8 | 2 | 3 | 2 | 0 |
| 192 | 67 | 0 | 2 | 115 | 564 | 0 | 0 | 160 | 0 | 1.6 | 1 | 0 | 3 | 1 |
| 629 | 65 | 1 | 3 | 138 | 282 | 1 | 0 | 174 | 0 | 1.4 | 1 | 1 | 2 | 0 |
| 559 | 67 | 1 | 0 | 120 | 237 | 0 | 1 | 71 | 0 | 1.0 | 1 | 0 | 2 | 0 |
| 684 | 60 | 1 | 2 | 140 | 185 | 0 | 0 | 155 | 0 | 3.0 | 1 | 0 | 2 | 0 |
1025 rows × 14 columns
# 4. Use the lowess option and save charts as JPG.
AV.AutoViz(
    filename='',
    dfte=df,
    depVar='target',
    verbose=2,  # 0: brief, 1: detailed, 2: save to files
    max_rows_analyzed=df.shape[0],
    max_cols_analyzed=df.shape[1],
    lowess=True,   # lowess smoothing option (see AutoViz docs)
    chart_format="jpg",
    save_plot_dir="./plots_autoviz4_jpg",
)
Shape of your Data Set loaded: (1025, 14) ####################################################################################### ######################## C L A S S I F Y I N G V A R I A B L E S #################### ####################################################################################### Classifying variables in data set... Data cleaning improvement suggestions. Complete them before proceeding to ML modeling.
| Nuniques | dtype | Nulls | Nullpercent | NuniquePercent | Value counts Min | Data cleaning improvement suggestions | |
|---|---|---|---|---|---|---|---|
| chol | 152 | int64 | 0 | 0.000000 | 14.829268 | 0 | |
| thalach | 91 | int64 | 0 | 0.000000 | 8.878049 | 0 | |
| trestbps | 49 | int64 | 0 | 0.000000 | 4.780488 | 0 | |
| age | 41 | int64 | 0 | 0.000000 | 4.000000 | 0 | |
| oldpeak | 40 | float64 | 0 | 0.000000 | 3.902439 | 0 | skewed: cap or drop outliers |
| ca | 5 | int64 | 0 | 0.000000 | 0.487805 | 0 | |
| cp | 4 | int64 | 0 | 0.000000 | 0.390244 | 0 | |
| thal | 4 | int64 | 0 | 0.000000 | 0.390244 | 0 | |
| restecg | 3 | int64 | 0 | 0.000000 | 0.292683 | 0 | |
| slope | 3 | int64 | 0 | 0.000000 | 0.292683 | 0 | |
| sex | 2 | int64 | 0 | 0.000000 | 0.195122 | 0 | |
| fbs | 2 | int64 | 0 | 0.000000 | 0.195122 | 0 | |
| exang | 2 | int64 | 0 | 0.000000 | 0.195122 | 0 |
Printing upto 30 columns max in each category:
Numeric Columns : ['oldpeak']
Integer-Categorical Columns: ['age', 'cp', 'trestbps', 'chol', 'restecg', 'thalach', 'slope', 'ca', 'thal']
String-Categorical Columns: []
Factor-Categorical Columns: []
String-Boolean Columns: []
Numeric-Boolean Columns: ['sex', 'fbs', 'exang']
Discrete String Columns: []
NLP text Columns: []
Date Time Columns: []
ID Columns: []
Columns that will not be considered in modeling: []
13 Predictors classified...
No variables removed since no ID or low-information variables found in data set
Since Number of Rows in data 1025 exceeds maximum, randomly sampling 1025 rows for EDA...
################ Binary_Classification problem #####################
Columns to delete:
' []'
Boolean variables %s
" ['sex', 'fbs', 'exang']"
Categorical variables %s
(" ['age', 'cp', 'trestbps', 'chol', 'restecg', 'thalach', 'slope', 'ca', "
"'thal', 'sex', 'fbs', 'exang']")
Continuous variables %s
" ['oldpeak']"
Discrete string variables %s
' []'
Date and time variables %s
' []'
ID variables %s
' []'
Target variable %s
' target'
All Plots are saved in ./plots_autoviz4_jpg\target
Time to run AutoViz = 8 seconds
| age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | target | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 807 | 44 | 1 | 2 | 130 | 233 | 0 | 1 | 179 | 1 | 0.4 | 2 | 0 | 2 | 0 |
| 27 | 58 | 0 | 1 | 136 | 319 | 1 | 0 | 152 | 0 | 0.0 | 2 | 2 | 2 | 1 |
| 77 | 63 | 1 | 0 | 140 | 187 | 0 | 0 | 144 | 1 | 4.0 | 2 | 2 | 3 | 1 |
| 406 | 58 | 1 | 2 | 140 | 211 | 1 | 0 | 165 | 0 | 0.0 | 2 | 0 | 2 | 0 |
| 886 | 61 | 1 | 0 | 120 | 260 | 0 | 1 | 140 | 1 | 3.6 | 1 | 1 | 3 | 1 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 835 | 49 | 1 | 2 | 118 | 149 | 0 | 0 | 126 | 0 | 0.8 | 2 | 3 | 2 | 1 |
| 192 | 67 | 0 | 2 | 115 | 564 | 0 | 0 | 160 | 0 | 1.6 | 1 | 0 | 3 | 0 |
| 629 | 65 | 1 | 3 | 138 | 282 | 1 | 0 | 174 | 0 | 1.4 | 1 | 1 | 2 | 1 |
| 559 | 67 | 1 | 0 | 120 | 237 | 0 | 1 | 71 | 0 | 1.0 | 1 | 0 | 2 | 1 |
| 684 | 60 | 1 | 2 | 140 | 185 | 0 | 0 | 155 | 0 | 3.0 | 1 | 0 | 2 | 1 |
1025 rows × 14 columns
# 5. Render interactive graphs inside the Jupyter notebook.
AV.AutoViz(
    filename='',
    dfte=df,
    depVar='target',
    verbose=0,  # 0 or 1 recommended for bokeh/server/html chart formats
    max_rows_analyzed=df.shape[0],
    max_cols_analyzed=df.shape[1],
    chart_format="bokeh",
)
Shape of your Data Set loaded: (1025, 14)
#######################################################################################
######################## C L A S S I F Y I N G V A R I A B L E S ####################
#######################################################################################
Classifying variables in data set...
Number of Numeric Columns = 1
Number of Integer-Categorical Columns = 9
Number of String-Categorical Columns = 0
Number of Factor-Categorical Columns = 0
Number of String-Boolean Columns = 0
Number of Numeric-Boolean Columns = 3
Number of Discrete String Columns = 0
Number of NLP String Columns = 0
Number of Date Time Columns = 0
Number of ID Columns = 0
Number of Columns to Delete = 0
13 Predictors classified...
No variables removed since no ID or low-information variables found in data set
Since Number of Rows in data 1025 exceeds maximum, randomly sampling 1025 rows for EDA...
################ Binary_Classification problem #####################
Time to run AutoViz (in seconds) = 1
| age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | target | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 807 | 44 | 1 | 2 | 130 | 233 | 0 | 1 | 179 | 1 | 0.4 | 2 | 0 | 2 | 1 |
| 27 | 58 | 0 | 1 | 136 | 319 | 1 | 0 | 152 | 0 | 0.0 | 2 | 2 | 2 | 0 |
| 77 | 63 | 1 | 0 | 140 | 187 | 0 | 0 | 144 | 1 | 4.0 | 2 | 2 | 3 | 0 |
| 406 | 58 | 1 | 2 | 140 | 211 | 1 | 0 | 165 | 0 | 0.0 | 2 | 0 | 2 | 1 |
| 886 | 61 | 1 | 0 | 120 | 260 | 0 | 1 | 140 | 1 | 3.6 | 1 | 1 | 3 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 835 | 49 | 1 | 2 | 118 | 149 | 0 | 0 | 126 | 0 | 0.8 | 2 | 3 | 2 | 0 |
| 192 | 67 | 0 | 2 | 115 | 564 | 0 | 0 | 160 | 0 | 1.6 | 1 | 0 | 3 | 1 |
| 629 | 65 | 1 | 3 | 138 | 282 | 1 | 0 | 174 | 0 | 1.4 | 1 | 1 | 2 | 0 |
| 559 | 67 | 1 | 0 | 120 | 237 | 0 | 1 | 71 | 0 | 1.0 | 1 | 0 | 2 | 0 |
| 684 | 60 | 1 | 2 | 140 | 185 | 0 | 0 | 155 | 0 | 3.0 | 1 | 0 | 2 | 0 |
1025 rows × 14 columns
# 6. Open interactive graphs in browser pop-up tabs (bokeh server mode:
#    one localhost URL is launched per chart type).
AV.AutoViz(
    filename='',
    dfte=df,
    depVar='target',
    verbose=0,  # 0 or 1 recommended for bokeh/server/html chart formats
    max_rows_analyzed=df.shape[0],
    max_cols_analyzed=df.shape[1],
    chart_format="server",
)
Shape of your Data Set loaded: (1025, 14)
#######################################################################################
######################## C L A S S I F Y I N G V A R I A B L E S ####################
#######################################################################################
Classifying variables in data set...
Number of Numeric Columns = 1
Number of Integer-Categorical Columns = 9
Number of String-Categorical Columns = 0
Number of Factor-Categorical Columns = 0
Number of String-Boolean Columns = 0
Number of Numeric-Boolean Columns = 3
Number of Discrete String Columns = 0
Number of NLP String Columns = 0
Number of Date Time Columns = 0
Number of ID Columns = 0
Number of Columns to Delete = 0
13 Predictors classified...
No variables removed since no ID or low-information variables found in data set
Since Number of Rows in data 1025 exceeds maximum, randomly sampling 1025 rows for EDA...
################ Binary_Classification problem #####################
scatterplots can be found in URL below:
Launching server at http://localhost:54106
distplots can be found in URL below:
Launching server at http://localhost:54107
distplots can be found in URL below:
Launching server at http://localhost:54108
kde_plots can be found in URL below:
Launching server at http://localhost:54109
violinplots can be found in URL below:
Launching server at http://localhost:54110
cat_var_plots can be found in URL below:
Launching server at http://localhost:54111
Time to run AutoViz (in seconds) = 0
| age | sex | cp | trestbps | chol | fbs | restecg | thalach | exang | oldpeak | slope | ca | thal | target | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 807 | 44 | 1 | 2 | 130 | 233 | 0 | 1 | 179 | 1 | 0.4 | 2 | 0 | 2 | 1 |
| 27 | 58 | 0 | 1 | 136 | 319 | 1 | 0 | 152 | 0 | 0.0 | 2 | 2 | 2 | 0 |
| 77 | 63 | 1 | 0 | 140 | 187 | 0 | 0 | 144 | 1 | 4.0 | 2 | 2 | 3 | 0 |
| 406 | 58 | 1 | 2 | 140 | 211 | 1 | 0 | 165 | 0 | 0.0 | 2 | 0 | 2 | 1 |
| 886 | 61 | 1 | 0 | 120 | 260 | 0 | 1 | 140 | 1 | 3.6 | 1 | 1 | 3 | 0 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 835 | 49 | 1 | 2 | 118 | 149 | 0 | 0 | 126 | 0 | 0.8 | 2 | 3 | 2 | 0 |
| 192 | 67 | 0 | 2 | 115 | 564 | 0 | 0 | 160 | 0 | 1.6 | 1 | 0 | 3 | 1 |
| 629 | 65 | 1 | 3 | 138 | 282 | 1 | 0 | 174 | 0 | 1.4 | 1 | 1 | 2 | 0 |
| 559 | 67 | 1 | 0 | 120 | 237 | 0 | 1 | 71 | 0 | 1.0 | 1 | 0 | 2 | 0 |
| 684 | 60 | 1 | 2 | 140 | 185 | 0 | 0 | 155 | 0 | 3.0 | 1 | 0 | 2 | 0 |
1025 rows × 14 columns
# Split features and target. NOTE: DataFrame.pop removes 'target' from df
# IN PLACE, so after this cell df itself is the 13-column feature matrix
# and x_data is an alias of it (not a copy).
y_data = df.pop('target')
x_data = df
print(x_data.shape)
print(y_data.shape)
(1025, 13) (1025,)
# Record the installed scikit-learn version (1.0.2 when this was run).
import sklearn
print(sklearn.__version__)
1.0.2
from sklearn.model_selection import train_test_split

# Hold out 20% as a test set; stratify on the target so both splits keep
# the same class ratio as the full dataset.
x_train, x_test, y_train, y_test = train_test_split(
    x_data, y_data, test_size=0.2, random_state=2022, stratify=y_data
)
print(x_train.shape, y_train.shape)
print(x_test.shape, y_test.shape)
(820, 13) (820,) (205, 13) (205,)
from lazypredict.Supervised import LazyClassifier

# Fit a battery of off-the-shelf classifiers and collect their test-set
# metrics; predictions=True additionally returns each model's predictions
# on x_test as a DataFrame.
clf = LazyClassifier(verbose=0, predictions=True)
models, predictions = clf.fit(x_train, x_test, y_train, y_test)
models
100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 40.81it/s]
| Accuracy | Balanced Accuracy | ROC AUC | F1 Score | Time Taken | |
|---|---|---|---|---|---|
| Model | |||||
| LGBMClassifier | 1.00 | 1.00 | 1.00 | 1.00 | 0.07 |
| LabelPropagation | 1.00 | 1.00 | 1.00 | 1.00 | 0.02 |
| XGBClassifier | 1.00 | 1.00 | 1.00 | 1.00 | 0.10 |
| DecisionTreeClassifier | 1.00 | 1.00 | 1.00 | 1.00 | 0.01 |
| RandomForestClassifier | 1.00 | 1.00 | 1.00 | 1.00 | 0.08 |
| ExtraTreeClassifier | 1.00 | 1.00 | 1.00 | 1.00 | 0.00 |
| ExtraTreesClassifier | 1.00 | 1.00 | 1.00 | 1.00 | 0.06 |
| BaggingClassifier | 1.00 | 1.00 | 1.00 | 1.00 | 0.02 |
| LabelSpreading | 1.00 | 1.00 | 1.00 | 1.00 | 0.03 |
| SVC | 0.94 | 0.94 | 0.94 | 0.94 | 0.02 |
| AdaBoostClassifier | 0.92 | 0.92 | 0.92 | 0.92 | 0.06 |
| NuSVC | 0.90 | 0.90 | 0.90 | 0.90 | 0.02 |
| QuadraticDiscriminantAnalysis | 0.89 | 0.89 | 0.89 | 0.89 | 0.01 |
| GaussianNB | 0.88 | 0.88 | 0.88 | 0.88 | 0.00 |
| SGDClassifier | 0.87 | 0.87 | 0.87 | 0.87 | 0.01 |
| CalibratedClassifierCV | 0.87 | 0.87 | 0.87 | 0.87 | 0.06 |
| LogisticRegression | 0.86 | 0.86 | 0.86 | 0.86 | 0.01 |
| LinearSVC | 0.86 | 0.86 | 0.86 | 0.86 | 0.03 |
| NearestCentroid | 0.86 | 0.86 | 0.86 | 0.86 | 0.01 |
| LinearDiscriminantAnalysis | 0.86 | 0.86 | 0.86 | 0.86 | 0.02 |
| RidgeClassifier | 0.86 | 0.86 | 0.86 | 0.86 | 0.01 |
| RidgeClassifierCV | 0.86 | 0.86 | 0.86 | 0.86 | 0.01 |
| BernoulliNB | 0.85 | 0.85 | 0.85 | 0.85 | 0.01 |
| KNeighborsClassifier | 0.83 | 0.83 | 0.83 | 0.83 | 0.01 |
| PassiveAggressiveClassifier | 0.78 | 0.78 | 0.78 | 0.78 | 0.01 |
| Perceptron | 0.76 | 0.76 | 0.76 | 0.76 | 0.01 |
| DummyClassifier | 0.51 | 0.50 | 0.50 | 0.35 | 0.00 |
predictions.head()
| AdaBoostClassifier | BaggingClassifier | BernoulliNB | CalibratedClassifierCV | DecisionTreeClassifier | DummyClassifier | ExtraTreeClassifier | ExtraTreesClassifier | GaussianNB | KNeighborsClassifier | ... | PassiveAggressiveClassifier | Perceptron | QuadraticDiscriminantAnalysis | RandomForestClassifier | RidgeClassifier | RidgeClassifierCV | SGDClassifier | SVC | XGBClassifier | LGBMClassifier | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | ... | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 1 | 1 | 1 | 0 | 1 | 1 | 1 | 1 | 1 | 1 | 0 | ... | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| 2 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | ... | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| 3 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | ... | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 | 1 |
| 4 | 1 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 1 | 1 | ... | 1 | 1 | 0 | 0 | 1 | 1 | 1 | 0 | 0 | 0 |
5 rows × 27 columns
from sklearn.metrics import classification_report

# Print a per-model precision/recall/F1 breakdown on the held-out test set.
for model_name in predictions.columns:
    print(model_name)
    print(classification_report(y_test, predictions[model_name]))
AdaBoostClassifier
precision recall f1-score support
0 0.95 0.89 0.92 100
1 0.90 0.95 0.93 105
accuracy 0.92 205
macro avg 0.92 0.92 0.92 205
weighted avg 0.92 0.92 0.92 205
BaggingClassifier
precision recall f1-score support
0 1.00 1.00 1.00 100
1 1.00 1.00 1.00 105
accuracy 1.00 205
macro avg 1.00 1.00 1.00 205
weighted avg 1.00 1.00 1.00 205
BernoulliNB
precision recall f1-score support
0 0.88 0.80 0.84 100
1 0.82 0.90 0.86 105
accuracy 0.85 205
macro avg 0.85 0.85 0.85 205
weighted avg 0.85 0.85 0.85 205
CalibratedClassifierCV
precision recall f1-score support
0 0.95 0.78 0.86 100
1 0.82 0.96 0.89 105
accuracy 0.87 205
macro avg 0.89 0.87 0.87 205
weighted avg 0.88 0.87 0.87 205
DecisionTreeClassifier
precision recall f1-score support
0 1.00 1.00 1.00 100
1 1.00 1.00 1.00 105
accuracy 1.00 205
macro avg 1.00 1.00 1.00 205
weighted avg 1.00 1.00 1.00 205
DummyClassifier
precision recall f1-score support
0 0.00 0.00 0.00 100
1 0.51 1.00 0.68 105
accuracy 0.51 205
macro avg 0.26 0.50 0.34 205
weighted avg 0.26 0.51 0.35 205
ExtraTreeClassifier
precision recall f1-score support
0 1.00 1.00 1.00 100
1 1.00 1.00 1.00 105
accuracy 1.00 205
macro avg 1.00 1.00 1.00 205
weighted avg 1.00 1.00 1.00 205
ExtraTreesClassifier
precision recall f1-score support
0 1.00 1.00 1.00 100
1 1.00 1.00 1.00 105
accuracy 1.00 205
macro avg 1.00 1.00 1.00 205
weighted avg 1.00 1.00 1.00 205
GaussianNB
precision recall f1-score support
0 0.89 0.85 0.87 100
1 0.86 0.90 0.88 105
accuracy 0.88 205
macro avg 0.88 0.88 0.88 205
weighted avg 0.88 0.88 0.88 205
KNeighborsClassifier
precision recall f1-score support
0 0.85 0.80 0.82 100
1 0.82 0.87 0.84 105
accuracy 0.83 205
macro avg 0.84 0.83 0.83 205
weighted avg 0.84 0.83 0.83 205
LabelPropagation
precision recall f1-score support
0 1.00 1.00 1.00 100
1 1.00 1.00 1.00 105
accuracy 1.00 205
macro avg 1.00 1.00 1.00 205
weighted avg 1.00 1.00 1.00 205
LabelSpreading
precision recall f1-score support
0 1.00 1.00 1.00 100
1 1.00 1.00 1.00 105
accuracy 1.00 205
macro avg 1.00 1.00 1.00 205
weighted avg 1.00 1.00 1.00 205
LinearDiscriminantAnalysis
precision recall f1-score support
0 0.96 0.74 0.84 100
1 0.80 0.97 0.88 105
accuracy 0.86 205
macro avg 0.88 0.86 0.86 205
weighted avg 0.88 0.86 0.86 205
LinearSVC
precision recall f1-score support
0 0.95 0.76 0.84 100
1 0.81 0.96 0.88 105
accuracy 0.86 205
macro avg 0.88 0.86 0.86 205
weighted avg 0.88 0.86 0.86 205
LogisticRegression
precision recall f1-score support
0 0.93 0.78 0.85 100
1 0.82 0.94 0.88 105
accuracy 0.86 205
macro avg 0.87 0.86 0.86 205
weighted avg 0.87 0.86 0.86 205
NearestCentroid
precision recall f1-score support
0 0.94 0.76 0.84 100
1 0.81 0.95 0.87 105
accuracy 0.86 205
macro avg 0.87 0.86 0.86 205
weighted avg 0.87 0.86 0.86 205
NuSVC
precision recall f1-score support
0 0.94 0.85 0.89 100
1 0.87 0.95 0.91 105
accuracy 0.90 205
macro avg 0.91 0.90 0.90 205
weighted avg 0.91 0.90 0.90 205
PassiveAggressiveClassifier
precision recall f1-score support
0 0.77 0.77 0.77 100
1 0.78 0.78 0.78 105
accuracy 0.78 205
macro avg 0.78 0.78 0.78 205
weighted avg 0.78 0.78 0.78 205
Perceptron
precision recall f1-score support
0 0.76 0.74 0.75 100
1 0.76 0.77 0.76 105
accuracy 0.76 205
macro avg 0.76 0.76 0.76 205
weighted avg 0.76 0.76 0.76 205
QuadraticDiscriminantAnalysis
precision recall f1-score support
0 0.91 0.87 0.89 100
1 0.88 0.91 0.90 105
accuracy 0.89 205
macro avg 0.89 0.89 0.89 205
weighted avg 0.89 0.89 0.89 205
RandomForestClassifier
precision recall f1-score support
0 1.00 1.00 1.00 100
1 1.00 1.00 1.00 105
accuracy 1.00 205
macro avg 1.00 1.00 1.00 205
weighted avg 1.00 1.00 1.00 205
RidgeClassifier
precision recall f1-score support
0 0.96 0.74 0.84 100
1 0.80 0.97 0.88 105
accuracy 0.86 205
macro avg 0.88 0.86 0.86 205
weighted avg 0.88 0.86 0.86 205
RidgeClassifierCV
precision recall f1-score support
0 0.96 0.74 0.84 100
1 0.80 0.97 0.88 105
accuracy 0.86 205
macro avg 0.88 0.86 0.86 205
weighted avg 0.88 0.86 0.86 205
SGDClassifier
precision recall f1-score support
0 0.91 0.82 0.86 100
1 0.84 0.92 0.88 105
accuracy 0.87 205
macro avg 0.88 0.87 0.87 205
weighted avg 0.88 0.87 0.87 205
SVC
precision recall f1-score support
0 0.97 0.90 0.93 100
1 0.91 0.97 0.94 105
accuracy 0.94 205
macro avg 0.94 0.94 0.94 205
weighted avg 0.94 0.94 0.94 205
XGBClassifier
precision recall f1-score support
0 1.00 1.00 1.00 100
1 1.00 1.00 1.00 105
accuracy 1.00 205
macro avg 1.00 1.00 1.00 205
weighted avg 1.00 1.00 1.00 205
LGBMClassifier
precision recall f1-score support
0 1.00 1.00 1.00 100
1 1.00 1.00 1.00 105
accuracy 1.00 205
macro avg 1.00 1.00 1.00 205
weighted avg 1.00 1.00 1.00 205
# Train a LightGBM classifier on the train split and score it on the test split.
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score
lgbm = LGBMClassifier()
lgbm.fit(x_train, y_train)
y_pred = lgbm.predict(x_test)
# accuracy_score's signature is (y_true, y_pred): ground truth first.
# The original passed them swapped — accuracy is symmetric so the value is
# unchanged, but the call should follow the documented argument order.
accuracy_score(y_test, y_pred)
1.0
Automatic hyperparameter tuning
SVM run without any tuning (baseline)
import numpy as np
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Baseline: standardize the features, then fit an SVC with its default
# hyperparameters (no tuning yet).
# NOTE(review): the original also built DataFrame copies
# `X_train = pd.DataFrame(x_train)` / `X_test = pd.DataFrame(x_test)` that
# were never read before being overwritten later; removed as dead code.
pipe = Pipeline(steps = [("preprocessor", StandardScaler()), ("classifier", SVC())])
pipe.fit(x_train, y_train)
y_pred = pipe.predict(x_test)
# accuracy_score expects (y_true, y_pred) — ground truth first.
accuracy_score(y_test, y_pred)
0.9365853658536586
# Inspect the default hyperparameters of the SVC step inside the pipeline.
pipe.named_steps["classifier"].get_params()
{'C': 1.0,
'break_ties': False,
'cache_size': 200,
'class_weight': None,
'coef0': 0.0,
'decision_function_shape': 'ovr',
'degree': 3,
'gamma': 'scale',
'kernel': 'rbf',
'max_iter': -1,
'probability': False,
'random_state': None,
'shrinking': True,
'tol': 0.001,
'verbose': False}
from skopt import BayesSearchCV
# Same preprocessing + SVC pipeline as the untuned baseline above.
pipe = Pipeline(steps = [("preprocessor", StandardScaler()), ("classifier", SVC())])
# Search space for the SVC step: (low, high, prior) tuples are sampled
# ranges, a plain (low, high) tuple of ints is an integer range, and a
# list is a categorical choice.
search_space = {
    'classifier__C': (1e-1, 1e+1, 'log-uniform'),
    'classifier__gamma': (1e-6, 1e+1, 'log-uniform'),
    'classifier__degree': (1, 8),  # integer valued parameter
    'classifier__kernel': ['linear', 'poly', 'rbf'],  # categorical parameter
}
# Bayesian optimisation over the space: 8 sampled configurations,
# each evaluated with 3-fold cross-validation.
opt = BayesSearchCV(pipe, search_space, n_iter = 8, cv = 3)
opt.fit(x_train, y_train)
print("val. score: %s" % opt.best_score_)
print("test score: %s " % opt.score(x_test, y_test))
val. score: 0.9670686167036533 test score: 1.0
# Best hyperparameter combination found by the Bayesian search.
opt.best_params_
OrderedDict([('classifier__C', 0.3190547286265487),
('classifier__degree', 4),
('classifier__gamma', 0.632736824293571),
('classifier__kernel', 'poly')])
import lazypredict
from lazypredict.Supervised import LazyClassifier
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
# Benchmark a zoo of off-the-shelf classifiers on the breast-cancer dataset.
data = load_breast_cancer()
# X data, Y data split
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size = 0.5, random_state = 123
)
clf = LazyClassifier(verbose = 0, ignore_warnings = True, custom_metric = None)
models, predictions = clf.fit(X_train, X_test, y_train, y_test)
models
100%|██████████████████████████████████████████████████████████████████████████████████| 29/29 [00:00<00:00, 57.18it/s]
| Accuracy | Balanced Accuracy | ROC AUC | F1 Score | Time Taken | |
|---|---|---|---|---|---|
| Model | |||||
| LinearSVC | 0.99 | 0.99 | 0.99 | 0.99 | 0.01 |
| Perceptron | 0.99 | 0.98 | 0.98 | 0.99 | 0.00 |
| LogisticRegression | 0.99 | 0.98 | 0.98 | 0.99 | 0.01 |
| SVC | 0.98 | 0.98 | 0.98 | 0.98 | 0.01 |
| XGBClassifier | 0.98 | 0.98 | 0.98 | 0.98 | 0.07 |
| LabelPropagation | 0.98 | 0.97 | 0.97 | 0.98 | 0.01 |
| LabelSpreading | 0.98 | 0.97 | 0.97 | 0.98 | 0.01 |
| BaggingClassifier | 0.97 | 0.97 | 0.97 | 0.97 | 0.02 |
| PassiveAggressiveClassifier | 0.98 | 0.97 | 0.97 | 0.98 | 0.00 |
| SGDClassifier | 0.98 | 0.97 | 0.97 | 0.98 | 0.01 |
| RandomForestClassifier | 0.97 | 0.97 | 0.97 | 0.97 | 0.08 |
| CalibratedClassifierCV | 0.98 | 0.97 | 0.97 | 0.98 | 0.02 |
| LGBMClassifier | 0.97 | 0.97 | 0.97 | 0.97 | 0.05 |
| QuadraticDiscriminantAnalysis | 0.96 | 0.97 | 0.97 | 0.97 | 0.01 |
| ExtraTreesClassifier | 0.97 | 0.96 | 0.96 | 0.97 | 0.06 |
| RidgeClassifierCV | 0.97 | 0.96 | 0.96 | 0.97 | 0.01 |
| RidgeClassifier | 0.97 | 0.96 | 0.96 | 0.97 | 0.01 |
| AdaBoostClassifier | 0.96 | 0.96 | 0.96 | 0.96 | 0.06 |
| KNeighborsClassifier | 0.96 | 0.96 | 0.96 | 0.96 | 0.01 |
| BernoulliNB | 0.95 | 0.95 | 0.95 | 0.95 | 0.01 |
| LinearDiscriminantAnalysis | 0.96 | 0.95 | 0.95 | 0.96 | 0.01 |
| GaussianNB | 0.95 | 0.95 | 0.95 | 0.95 | 0.00 |
| NuSVC | 0.95 | 0.94 | 0.94 | 0.95 | 0.01 |
| ExtraTreeClassifier | 0.94 | 0.93 | 0.93 | 0.94 | 0.01 |
| NearestCentroid | 0.95 | 0.93 | 0.93 | 0.95 | 0.01 |
| DecisionTreeClassifier | 0.93 | 0.93 | 0.93 | 0.93 | 0.01 |
| DummyClassifier | 0.64 | 0.50 | 0.50 | 0.50 | 0.00 |
from lazypredict.Supervised import LazyRegressor
from sklearn import datasets
from sklearn.utils import shuffle
import numpy as np
# NOTE: sklearn.datasets.load_boston() was deprecated in scikit-learn 1.0 and
# removed in 1.2, so the original call fails on current versions. Load the
# identical data from its original source instead, exactly as the
# scikit-learn deprecation notice recommends (requires network access).
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
# Records are spread over pairs of physical lines: 13 features + target MEDV.
boston_X = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
boston_y = raw_df.values[1::2, 2]
# Shuffle reproducibly, downcast features, and split 90/10 train/test.
X, y = shuffle(boston_X, boston_y, random_state = 13)
X = X.astype(np.float32)
offset = int(X.shape[0] * 0.9)
X_train, y_train = X[:offset], y[:offset]
X_test, y_test = X[offset:], y[offset:]
# Benchmark every available regressor on the split.
reg = LazyRegressor(verbose = 0, ignore_warnings = False, custom_metric = None)
models2, predictions2 = reg.fit(X_train, X_test, y_train, y_test)
14%|███████████▊ | 6/42 [00:00<00:00, 56.45it/s]
ElasticNetCV model failed to execute Gram matrix passed in via 'precompute' parameter did not pass validation when a single element was checked - please check that it was computed properly. For element (6,7) we computed -263.0660400390625 but the user-supplied value was -263.06610107421875.
60%|████████████████████████████████████████████████▊ | 25/42 [00:00<00:00, 29.42it/s]
LassoCV model failed to execute Gram matrix passed in via 'precompute' parameter did not pass validation when a single element was checked - please check that it was computed properly. For element (6,7) we computed -263.0660400390625 but the user-supplied value was -263.06610107421875.
100%|██████████████████████████████████████████████████████████████████████████████████| 42/42 [00:03<00:00, 12.82it/s]
# Display the leaderboard of regressor results.
models2
| Adjusted R-Squared | R-Squared | RMSE | Time Taken | |
|---|---|---|---|---|
| Model | ||||
| SVR | 0.83 | 0.88 | 2.62 | 0.01 |
| BaggingRegressor | 0.83 | 0.88 | 2.63 | 0.02 |
| NuSVR | 0.82 | 0.86 | 2.76 | 0.01 |
| RandomForestRegressor | 0.81 | 0.86 | 2.78 | 0.19 |
| XGBRegressor | 0.81 | 0.86 | 2.79 | 0.08 |
| GradientBoostingRegressor | 0.81 | 0.86 | 2.84 | 0.08 |
| ExtraTreesRegressor | 0.79 | 0.84 | 2.98 | 0.13 |
| AdaBoostRegressor | 0.78 | 0.83 | 3.04 | 0.06 |
| HistGradientBoostingRegressor | 0.77 | 0.83 | 3.06 | 0.39 |
| PoissonRegressor | 0.77 | 0.83 | 3.11 | 0.01 |
| LGBMRegressor | 0.77 | 0.83 | 3.11 | 0.06 |
| KNeighborsRegressor | 0.77 | 0.83 | 3.12 | 0.01 |
| DecisionTreeRegressor | 0.65 | 0.74 | 3.79 | 0.01 |
| MLPRegressor | 0.65 | 0.74 | 3.80 | 0.35 |
| HuberRegressor | 0.64 | 0.74 | 3.84 | 0.02 |
| GammaRegressor | 0.64 | 0.73 | 3.88 | 0.01 |
| LinearSVR | 0.62 | 0.72 | 3.96 | 0.01 |
| RidgeCV | 0.62 | 0.72 | 3.97 | 0.01 |
| BayesianRidge | 0.62 | 0.72 | 3.97 | 0.00 |
| Ridge | 0.62 | 0.72 | 3.97 | 0.01 |
| LinearRegression | 0.62 | 0.72 | 3.97 | 0.00 |
| TransformedTargetRegressor | 0.62 | 0.72 | 3.97 | 0.00 |
| LassoLarsIC | 0.62 | 0.72 | 3.98 | 0.01 |
| LassoLarsCV | 0.62 | 0.72 | 3.98 | 0.01 |
| Lars | 0.61 | 0.72 | 3.99 | 0.01 |
| LarsCV | 0.61 | 0.71 | 4.02 | 0.03 |
| SGDRegressor | 0.60 | 0.70 | 4.07 | 0.01 |
| TweedieRegressor | 0.59 | 0.70 | 4.12 | 0.00 |
| ElasticNet | 0.58 | 0.69 | 4.16 | 0.01 |
| Lasso | 0.54 | 0.66 | 4.35 | 0.01 |
| RANSACRegressor | 0.53 | 0.65 | 4.41 | 0.04 |
| OrthogonalMatchingPursuitCV | 0.45 | 0.59 | 4.78 | 0.01 |
| PassiveAggressiveRegressor | 0.37 | 0.54 | 5.09 | 0.00 |
| GaussianProcessRegressor | 0.23 | 0.43 | 5.65 | 0.02 |
| OrthogonalMatchingPursuit | 0.16 | 0.38 | 5.89 | 0.00 |
| ExtraTreeRegressor | 0.08 | 0.32 | 6.17 | 0.01 |
| QuantileRegressor | -0.35 | -0.00 | 7.49 | 1.60 |
| DummyRegressor | -0.38 | -0.02 | 7.56 | 0.00 |
| LassoLars | -0.38 | -0.02 | 7.56 | 0.00 |
| KernelRidge | -11.50 | -8.25 | 22.74 | 0.02 |